Differential abundance analysis and cell proportions

Load Libraries and set working directory

### 
libs<-c('scater','loomR','Seurat','patchwork','SeuratDisk','monocle3','ggpubr','cowplot','ggplot2','ggbeeswarm',
        'monocle3','limma','edgeR','fitdistrplus','factoextra','ggrepel','tidyverse','pheatmap','reshape2','ComplexHeatmap',
        "survminer","survival","ggalluvial",'gtsummary','variancePartition','DirichletReg','cowsay','emmeans')
lapply(libs, require, character.only = TRUE) ; rm(libs)
### 
setwd("/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/")

Load data and/or tables

file_717_per_cell_md <- readRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/per_cell_md.rds')
#test <- readRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/717_per_cell_md.rds') # same file
#identical(rownames(test),rownames(file_717_per_cell_md)) # check

Data cleaning and preparation

### Mark doublets in a binary variable
file_717_per_cell_md$doublet_pred_edit <- file_717_per_cell_md$doublet_pred
file_717_per_cell_md$doublet_pred_edit[file_717_per_cell_md$doublet_pred_edit %in% c('Plasma_B','poss_singlet_cluster','singlet')] <- 'singlet'
### cells x sample
x <- melt(table(file_717_per_cell_md$sample_id))

Data distribution

ggarrange(
  ggplot(data=x)+aes(x='Samples',y=log10(value))+theme_classic()+geom_boxplot(),
  ggplot(data=x)+aes(y=log10(value))+theme_classic()+geom_density(fill='grey'),ncol=2,align='hv'
  )

Investigate data composition for Site x Batch

### Prepare data for composition
x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('sample_id','visit_type','siteXbatch'))]
rownames(x) <- NULL ; x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

y <- melt(table(file_717_per_cell_md$sample_id))
y <- as.data.frame(y)
y <- y[order(y$value,decreasing = T),]
rownames(y) <- y$Var1
y$Var1 <- NULL

Verify and order data for correctness

identical(rownames(y),rownames(x))
## [1] FALSE
x<-x[match(rownames(y),rownames(x)),]
identical(rownames(y),rownames(x))
## [1] TRUE
row_ha = rowAnnotation(foo2 = runif(10), bar2 = anno_barplot(runif(10)))
row_ha = rowAnnotation(df=x,
                       col = list(visit_type = c("baseline_diagnosis" = "pink","other" = "lightgreen", 
                                                 "relapse_progression" = "orange", 
                                                 "remission_response" = "deepskyblue1","post_transplant" = "blue"),
                                  site = c("EMORY" = "darkred","MAYO" = "darkblue","MSSM" = "darkgreen","WUSTL" = "darkorange3"),
                                  batch = c("B1" = "grey25","B2" = "grey50","B3" = "grey75","B4" = "grey90")))
Heatmap(as.matrix(log10(y+1)), 
               col = colorRampPalette(c("steelblue","white","firebrick"))(255),
               show_row_names = FALSE , show_column_names = FALSE,
               column_names_rot = 45, 
               row_split = x$site ,right_annotation = row_ha,
               border = TRUE, use_raster = FALSE,
               name = "Log10(Cells)")

subcluster x sample

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
col_ha = rowAnnotation(df=x, col = list(visit_type = c("baseline_diagnosis" = "pink", 
                                                       "other" = "lightgreen", 
                                                       "relapse_progression" = "orange", 
                                                       "remission_response" = "deepskyblue1", 
                                                       "post_transplant" = "blue"),
                                        site = c("EMORY" = "darkred", 
                                                 "MAYO" = "darkblue", 
                                                 "MSSM" = "darkgreen", 
                                                 "WUSTL" = "darkorange3"),
                                        batch = c("B1" = "grey25", 
                                                  "B2" = "grey50", 
                                                  "B3" = "grey75", 
                                                  "B4" = "grey90")))

z <- data.frame(Compartment=rownames(y),Cluster=rownames(y))
z$Compartment[grep('Plasma',z$Compartment)]<- 'Plasma'
z$Compartment[grep('NkT',z$Compartment)]<- 'NkT'
z$Compartment[grep('Myeloid',z$Compartment)]<- 'Myeloid'
z$Compartment[grep('^BEry',z$Compartment)]<- 'BEry'
z$Compartment[grep('^Ery',z$Compartment)]<- 'Ery'
z$Compartment[grep('Full',z$Compartment)]<- 'Fibroblasts'
rownames(z) <- z$Cluster
z$Cluster <- NULL
Heatmap(t(as.matrix(log10(y+1))), 
        col = colorRampPalette(c("steelblue","white","firebrick"))(255),
        show_row_names = FALSE , show_column_names = FALSE, column_names_rot = 45, 
        row_split =  x$visit_type,  column_split = z$Compartment,
        right_annotation = col_ha,
        border = TRUE, use_raster = FALSE,
        name = "Log10(Cells)")

Variance Partition / Mixed effect model with multiple random effects

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'

for(i in 1:ncol(y)){ y[,i] <- y[,i]/sum(y[,i]) }

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL
identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### packages and multicore settings
libs<-c("variancePartition","doParallel","BiocParallel")
lapply(libs, require, character.only = TRUE)
## [[1]]
## [1] TRUE
## 
## [[2]]
## [1] TRUE
## 
## [[3]]
## [1] TRUE
registerDoParallel(makeCluster(10)) 

RISK (all clusters)

###
my_doublets <- unique(file_717_per_cell_md$subcluster_V03072023[file_717_per_cell_md$doublet_pred_edit %in% c('dblet_cluster','poss_dblet_cluster')])
### 4 variables
form <- ~ (1|davies_based_risk) + (1|visit_type) + (1|site) + (1|batch)
### run the model
variance_exprs_matrix <- fitExtractVarPartModel(y, form, x)
pdf(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/figures/review_diag_variance_confounders_risk.pdf',width = 4,height = 2.5)
plotVarPart( sortCols( variance_exprs_matrix ) ) +theme_classic()+ ggtitle( 'Variance') + theme(axis.text.x = element_text(angle = 60,hjust = 1))
dev.off()
## quartz_off_screen 
##                 2
plotVarPart( sortCols( variance_exprs_matrix ) ) +theme_classic()+ ggtitle( 'Variance') + theme(axis.text.x = element_text(angle = 60,hjust = 1))

Progression (all clusters)

### progression
form <- ~ (1|progression_group) + (1|visit_type) + (1|site) + (1|batch)
### run the model
variance_exprs_matrix <- fitExtractVarPartModel(y, form, x)

Variance Risk (No doublet clusters)

my_doublets <- as.character(unique(file_717_per_cell_md$subcluster_V03072023[which(!file_717_per_cell_md$doublet_pred_edit %in% 'singlet')]))
### 4 variables
form <- ~ (1|davies_based_risk) + (1|visit_type) + (1|site) + (1|batch)
### run the model
variance_exprs_matrix <- fitExtractVarPartModel(y[!rownames(y) %in% my_doublets,], form, x)
pdf(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/figures/review_diag_variance_confounders_risk_noDB.pdf',width = 4,height = 2.5)
plotVarPart( sortCols( variance_exprs_matrix ) ) +theme_classic()+ ggtitle( 'Variance') + theme(axis.text.x = element_text(angle = 60,hjust = 1))
dev.off()
## quartz_off_screen 
##                 2
plotVarPart( sortCols( variance_exprs_matrix ) ) +theme_classic()+ ggtitle( 'Variance') + theme(axis.text.x = element_text(angle = 60,hjust = 1))

Variance Progression (No doublet clusters)

### progression
form <- ~ (1|progression_group) + (1|visit_type) + (1|site) + (1|batch)
### run the model
variance_exprs_matrix <- fitExtractVarPartModel(y[!rownames(y) %in% my_doublets,], form, x)
plotVarPart( sortCols( variance_exprs_matrix ) ) +theme_classic()+ ggtitle( 'Variance') + theme(axis.text.x = element_text(angle = 60,hjust = 1))

DAA Progression (All samples)

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'

#y <- log2(y+1)
for(i in 1:ncol(y)){ y[,i] <- y[,i]/sum(y[,i]) }
  
x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]

### Design 
form <- ~ 0 + progression_group + (1|site) + (1|batch)
### no weights because data is already normalized
L =  makeContrastsDream( form, x, 
                         contrasts = c(#'progression_groupInc - progression_groupNP','progression_groupP - progression_groupNP',
                                       'progression_groupRP - progression_groupNP'))
### fit with contrasts
fitmm = dream( y, form, x, L)
### Store all outcome statistics
lmfreq_results <- list() ; for ( i in 1 ) { lmfreq_results[[i]] <- topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") 
lmfreq_results[[i]]$Comparison <- colnames(fitmm$coefficients)[i]
lmfreq_results[[i]]$Marker <- rownames( topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") ) }
progression_res <- do.call(rbind,lmfreq_results)
rownames(progression_res) <- NULL

progression_res$nLogFDR <- -log10(progression_res$adj.P.Val)
progression_res$Comparison <- gsub('progression_group','',progression_res$Comparison)

DAA Risk (all clusters)

### Risk
y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'

#y <- log2(y+1)
for(i in 1:ncol(y)){ y[,i] <- y[,i]/sum(y[,i]) }

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]

### Design 
form <- ~ 0 + davies_based_risk + (1|site) + (1|batch)
### no weights because data is already normalized
L =  makeContrastsDream( form, 
                         x, 
                         contrasts = c('davies_based_riskhigh_risk - davies_based_riskstandard_risk'))
### fit with contrasts
fitmm = dream( y, form, x, L)
### Store all outcome statistics
lmfreq_results <- list() ; for ( i in 1 ) { lmfreq_results[[i]] <- topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") 
lmfreq_results[[i]]$Comparison <- colnames(fitmm$coefficients)[i]
lmfreq_results[[i]]$Marker <- rownames( topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") ) }
risk_res <- do.call(rbind,lmfreq_results)
rownames(risk_res) <- NULL

risk_res$nLogFDR <- -log10(risk_res$adj.P.Val)
risk_res$Comparison <- gsub('davies_based_risk','',risk_res$Comparison)

DAA Progression (No Doublet clusters)

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'
  
x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]
y <- y[!rownames(y) %in% my_doublets,]
#y <- log2(y+1)
for(i in 1:ncol(y)){ y[,i] <- y[,i]/sum(y[,i]) }


### Design 
form <- ~ 0 + progression_group  + (1|site) + (1|batch)
### no weights because data is already normalized
L =  makeContrastsDream( form, x, 
                         contrasts = c(#'progression_groupInc - progression_groupNP','progression_groupP - progression_groupNP',
                                       'progression_groupRP - progression_groupNP'))
### fit with contrasts
fitmm = dream( y, form, x, L)
### Store all outcome statistics
lmfreq_results <- list() ; for ( i in 1:1 ) { lmfreq_results[[i]] <- topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") 
lmfreq_results[[i]]$Comparison <- colnames(fitmm$coefficients)[i]
lmfreq_results[[i]]$Marker <- rownames( topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") ) }
progression_res_ND <- do.call(rbind,lmfreq_results)
rownames(progression_res_ND) <- NULL

progression_res_ND$nLogFDR <- -log10(progression_res_ND$adj.P.Val)
progression_res_ND$Comparison <- gsub('progression_group','',progression_res_ND$Comparison)

DAA Risk (No Doublet clusters)

### Risk
y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]
y <- y[!rownames(y) %in% my_doublets,]
#y <- log2(y+1)
for(i in 1:ncol(y)){ y[,i] <- y[,i]/sum(y[,i]) }

### Design 
form <- ~ 0 + davies_based_risk + (1|site) + (1|batch)
### no weights because data is already normalized
L =  makeContrastsDream( form, 
                         x, 
                         contrasts = c('davies_based_riskhigh_risk - davies_based_riskstandard_risk'))
### fit with contrasts
fitmm = dream( y, form, x, L)
### Store all outcome statistics
lmfreq_results <- list() ; for ( i in 1 ) { lmfreq_results[[i]] <- topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") 
lmfreq_results[[i]]$Comparison <- colnames(fitmm$coefficients)[i]
lmfreq_results[[i]]$Marker <- rownames( topTable(fitmm, coef=i,n=Inf,adjust.method="fdr") ) }
risk_res_ND <- do.call(rbind,lmfreq_results)
rownames(risk_res_ND) <- NULL

risk_res_ND$nLogFDR <- -log10(risk_res_ND$adj.P.Val)
risk_res_ND$Comparison <- gsub('davies_based_risk','',risk_res_ND$Comparison)

Comparison between all clusters and no doublets (ND) for Risk

risk_res_tmp <- risk_res[which(risk_res$Marker %in% risk_res_ND$Marker),]
identical(risk_res_tmp$Marker,risk_res_ND$Marker)
## [1] FALSE
a <-risk_res_tmp ; b <- risk_res_ND
cor.test(a$AveExpr,b$AveExpr,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$AveExpr and b$AveExpr
## t = 1.5683, df = 87, p-value = 0.1204
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0439650  0.3615809
## sample estimates:
##       cor 
## 0.1658099
cor.test(a$logFC,b$logFC,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$logFC and b$logFC
## t = 3.9712, df = 87, p-value = 0.0001469
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1997734 0.5547335
## sample estimates:
##       cor 
## 0.3917325
cor.test(a$adj.P.Val,b$adj.P.Val,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$adj.P.Val and b$adj.P.Val
## t = 34.406, df = 87, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473180 0.9770337
## sample estimates:
##       cor 
## 0.9651623
cor.test(a$P.Value,b$P.Value,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$P.Value and b$P.Value
## t = 124.84, df = 87, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9957610 0.9981776
## sample estimates:
##       cor 
## 0.9972203
x <- data.frame(c=b$Marker,logfc_nd=b$logFC,adpval_nd=b$adj.P.Val,m='ND')
y <- data.frame(c=a$Marker,logfc=a$logFC,adpval=a$adj.P.Val,m='all')
z<-merge(x,y,by='c')
ggplot(data=z)+aes(x=logfc_nd,y=logfc) + theme_classic()+geom_point()

ggplot(data=z)+aes(x=adpval_nd,y=adpval) + theme_classic()+geom_point()

Comparison between all clusters and no doublets (ND) for Risk

progression_res_tmp <- progression_res[which(progression_res$Marker %in% progression_res_ND$Marker),]
a <- progression_res_tmp[progression_res_tmp$Comparison %in% 'RP - NP',]
b <- progression_res_ND[progression_res_ND$Comparison %in% 'RP - NP',]
identical(a$Marker,b$Marker)
## [1] FALSE
cor.test(a$AveExpr,b$AveExpr,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$AveExpr and b$AveExpr
## t = 1.8403, df = 87, p-value = 0.06913
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01530472  0.38625499
## sample estimates:
##      cor 
## 0.193569
cor.test(a$logFC,b$logFC,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$logFC and b$logFC
## t = -2.062, df = 87, p-value = 0.04219
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.405871558 -0.007961722
## sample estimates:
##        cor 
## -0.2158605
cor.test(a$adj.P.Val,b$adj.P.Val,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$adj.P.Val and b$adj.P.Val
## t = 73.841, df = 87, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9879936 0.9948269
## sample estimates:
##       cor 
## 0.9921162
cor.test(a$P.Value,b$P.Value,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$P.Value and b$P.Value
## t = 197.06, df = 87, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9982940 0.9992671
## sample estimates:
##       cor 
## 0.9988817

DAA Risk (No Doublet clusters) # Per compartment as 100%

### Risk
y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'
  
x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

x<-x[match(colnames(y),rownames(x)),]

### Subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]
y <- y[!rownames(y) %in% my_doublets,]

Compartment <- c('NkT','BEry','Ery','Myeloid','Plasma') #,'Full'
res <- list()
for(iii in Compartment){
  
  ix <- grep(paste('^',iii,sep=''), rownames(y))
  z <- y[ix,]
  z[z==0] <- 0.0001
  for(i in 1:ncol(z)){ z[,i] <- z[,i]/sum(z[,i]) } 
  
  ### Design 
  form <- ~ 0 + davies_based_risk + (1|site) + (1|batch)
  
  ### no weights because data is already normalized
  L =  makeContrastsDream( form, x, 
                         contrasts = c('davies_based_riskstandard_risk - davies_based_riskhigh_risk'))
  
  ### fit with contrasts
  fitmm = dream( z, form, x, L)
  ###
  a <- topTable(fitmm, coef=1,n=Inf,adjust.method="fdr") 
  a$Comparison <- colnames(fitmm$coefficients)[1]
  a$Marker <- rownames( topTable(fitmm, coef=1,n=Inf,adjust.method="fdr") ) 
  a$nLogFDR <- -log10(a$adj.P.Val)
  a$Comparison <- gsub('davies_based_risk','',a$Comparison)
  res[[iii]] <- a
}
res_compartment_nd <- do.call(rbind,res)
rownames(res_compartment_nd) <- res_compartment_nd$Marker
risk_res_tmp <- risk_res_ND[which(risk_res_ND$Marker %in% res_compartment_nd$Marker),]
risk_res_tmp <- risk_res_tmp[match(res_compartment_nd$Marker,risk_res_tmp$Marker),]
identical(res_compartment_nd$Marker,risk_res_tmp$Marker)
## [1] TRUE
a <-risk_res_tmp ; b <- res_compartment_nd
a <-risk_res_tmp ; b <- res_compartment_nd
cor.test(a$AveExpr,b$AveExpr,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$AveExpr and b$AveExpr
## t = 5.2701, df = 86, p-value = 9.978e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3174936 0.6375489
## sample estimates:
##       cor 
## 0.4940811
cor.test(a$logFC,b$logFC,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$logFC and b$logFC
## t = -4.5239, df = 86, p-value = 1.933e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5933895 -0.2521483
## sample estimates:
##        cor 
## -0.4384365
cor.test(a$adj.P.Val,b$adj.P.Val,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$adj.P.Val and b$adj.P.Val
## t = 3.0381, df = 86, p-value = 0.003153
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1089856 0.4888865
## sample estimates:
##       cor 
## 0.3113216
cor.test(a$P.Value,b$P.Value,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$P.Value and b$P.Value
## t = 4.6044, df = 86, p-value = 1.42e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2594278 0.5984127
## sample estimates:
##       cor 
## 0.4447069
x <- data.frame(c=b$Marker,logfc_nd=b$logFC,adpval_nd=b$adj.P.Val,m='ND')
y <- data.frame(c=a$Marker,logfc=a$logFC,adpval=a$adj.P.Val,m='all')
z<-merge(x,y,by='c')
ggplot(data=z)+aes(x=logfc_nd,y=logfc) + theme_classic()+geom_point()

ggplot(data=z)+aes(x=adpval_nd,y=adpval) + theme_classic()+geom_point()

Risk for DREG (all cells)

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'
#y <- log2(y+1)

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]

# to the transformation proposed by Smithson and Verkuilen (2006)
cell_proportions = DR_data( t(y ))
x$site_batch <- paste(x$site,x$batch,sep='_')
x$davies_based_risk <- factor(x$davies_based_risk,levels=c('standard_risk','high_risk'))
dr_fit_common <- DirichReg( cell_proportions ~ davies_based_risk + site_batch, x, model = "common" )

u = summary(dr_fit_common)
pvals = u$coef.mat[grep('davies_based_riskhigh_risk', rownames(u$coef.mat), invert=FALSE), 4]
v = names(pvals)

prob.ratio = exp( summary(dr_fit_common)$coefficients[paste0(rownames(y),":davies_based_riskhigh_risk")] )

pvals = matrix(pvals, ncol=length(u$varnames))
rownames(pvals) = gsub('davies_based_riskhigh_risk', '', v[1:nrow(pvals)])
colnames(pvals) = u$varnames

dirichlet_res <- data.frame(log2fc = log2(exp( summary(dr_fit_common)$coefficients[paste0(rownames(y),":davies_based_riskhigh_risk")] )) , 
                            pval=colMeans(pvals))

rownames(dirichlet_res) <- gsub(':davies_based_riskhigh_risk','',rownames(dirichlet_res))
dirichlet_res$Marker <- rownames(dirichlet_res)
risk_res_tmp <- dirichlet_res[which(dirichlet_res$Marker %in% risk_res$Marker),]
risk_res_tmp <- risk_res_tmp[match(risk_res$Marker,risk_res_tmp$Marker),]
identical(risk_res_tmp$Marker,risk_res$Marker)
## [1] TRUE
a <-risk_res_tmp ; b <- risk_res
cor.test(a$log2fc,b$logFC,method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$log2fc and b$logFC
## t = 7.8663, df = 104, p-value = 3.601e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4753991 0.7178839
## sample estimates:
##       cor 
## 0.6107671
cor.test(a$pval,b$P.Value,method = 'pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$pval and b$P.Value
## t = -0.23296, df = 104, p-value = 0.8162
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2126669  0.1686522
## sample estimates:
##         cor 
## -0.02283798

comparing both models Dream vs Dirichlet

ggplot()+aes(x=a$log2fc,y=b$logFC*100)+geom_point() + theme_classic() + 
  geom_hline(yintercept = 0, linetype=2,color='deeppink') + geom_vline(xintercept = 0, linetype=2,color='deeppink') +
  geom_smooth(color='skyblue1',method = 'glm')+labs(x='Dirichlet',y='Dream')
## `geom_smooth()` using formula = 'y ~ x'

Risk for DREG (no doublets)

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'
#y <- log2(y+1)

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]
y <- y[!rownames(y) %in% my_doublets,]

# to the transformation proposed by Smithson and Verkuilen (2006)
cell_proportions = DR_data( t(y ))
x$site_batch <- paste(x$site,x$batch,sep='_')
x$davies_based_risk <- factor(x$davies_based_risk,levels=c('standard_risk','high_risk'))
dr_fit_common <- DirichReg( cell_proportions ~ davies_based_risk + site_batch, x, model = "common" )

u = summary(dr_fit_common)
pvals = u$coef.mat[grep('davies_based_riskhigh_risk', rownames(u$coef.mat), invert=FALSE), 4]
v = names(pvals)

prob.ratio = exp( summary(dr_fit_common)$coefficients[paste0(rownames(y),":davies_based_riskhigh_risk")] )

pvals = matrix(pvals, ncol=length(u$varnames))
rownames(pvals) = gsub('davies_based_riskhigh_risk', '', v[1:nrow(pvals)])
colnames(pvals) = u$varnames

dirichlet_ND_res <- data.frame(log2fc = log2(exp( summary(dr_fit_common)$coefficients[paste0(rownames(y),":davies_based_riskhigh_risk")] )) , 
                            pval=colMeans(pvals))

rownames(dirichlet_ND_res) <- gsub(':davies_based_riskhigh_risk','',rownames(dirichlet_ND_res))
dirichlet_ND_res$Marker <- rownames(dirichlet_ND_res)
risk_res_tmp <- dirichlet_ND_res[which(dirichlet_ND_res$Marker %in% risk_res_ND$Marker),]
risk_res_tmp <- risk_res_tmp[match(risk_res_ND$Marker,risk_res_tmp$Marker),]
identical(risk_res_tmp$Marker,risk_res_ND$Marker)
## [1] TRUE
a <-risk_res_tmp ; b <- risk_res_ND
ggplot()+aes(x=a$log2fc,y=b$logFC*100)+geom_point() + theme_classic() + 
  geom_hline(yintercept = 0, linetype=2,color='deeppink') + geom_vline(xintercept = 0, linetype=2,color='deeppink') +
  geom_smooth(color='skyblue1',method = 'glm')+labs(x='Dirichlet',y='Dream')
## `geom_smooth()` using formula = 'y ~ x'

###
my_cells <- rownames(y)
###
subcluster_V2_df <- melt(table(file_717_per_cell_md$subcluster_V03072023,
                               file_717_per_cell_md$sample_id,
                               file_717_per_cell_md$doublet_pred_edit,
                               file_717_per_cell_md$visit_type,
                               file_717_per_cell_md$davies_based_risk,
                               file_717_per_cell_md$siteXbatch))
colnames(subcluster_V2_df) <- c('Subcluster','sample_id','doublet','visit','risk','batch_site','cells')

pdf(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/figures/review_diag_boxplots_cell_cluster_per_batch_site_baseline.pdf',width = 3,height = 7)
for(i in 1:length(my_cells)){

x <- subcluster_V2_df[subcluster_V2_df$Subcluster %in% my_cells[i] & 
                        subcluster_V2_df$risk %in% c('high_risk','standard_risk') & 
                        subcluster_V2_df$visit %in% c('baseline_diagnosis'),]

x$log10cells <- log10(x$cells+1)
x <- x[x$log10cells>0,]

print(
ggplot(data=x)+aes(x=risk,y=log10cells)+geom_boxplot(outlier.size = NA,outlier.shape =NA)+
  theme_classic()+geom_quasirandom()+
  labs(title = paste(my_cells[i]))+facet_wrap(~batch_site,ncol=3)+
  rotate_x_text(angle = 90)
)
}
dev.off()
## quartz_off_screen 
##                 2
for(i in 1:length(my_cells)){

x <- subcluster_V2_df[subcluster_V2_df$Subcluster %in% my_cells[i] & 
                        subcluster_V2_df$risk %in% c('high_risk','standard_risk') & 
                        subcluster_V2_df$visit %in% c('baseline_diagnosis'),]

x$log10cells <- log10(x$cells+1)
x <- x[x$log10cells>0,]

print(
ggplot(data=x)+aes(x=risk,y=log10cells)+geom_boxplot(outlier.size = NA,outlier.shape =NA)+
  theme_classic()+geom_quasirandom()+
  labs(title = paste(my_cells[i]))+facet_wrap(~batch_site,ncol=3)+
  rotate_x_text(angle = 90)
)
}

Risk for DREG by compartment (no doublets)

Duplicated code (identical to Yered’s). Used for comparison purposes.

y <- table(file_717_per_cell_md$subcluster_V03072023,file_717_per_cell_md$sample_id)
class(y) <- 'matrix'
#y <- log2(y+1)

x <- file_717_per_cell_md[,which(colnames(file_717_per_cell_md) %in% c('davies_based_risk','sample_id','visit_type','siteXbatch','progression_group'))]
rownames(x) <- NULL
x <- unique(x)
x$site <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$a
x$batch <- tidyr::separate(data.frame(x$siteXbatch),1, sep="_", c("a","b",'c','d'))$b
rownames(x) <- x$sample_id
x$sample_id <- NULL ; x$siteXbatch <- NULL

identical(colnames(y),rownames(x))
## [1] FALSE
x<-x[match(colnames(y),rownames(x)),]
identical(colnames(y),rownames(x))
## [1] TRUE
### subset
ix <- which(x$davies_based_risk %in% c('high_risk','standard_risk') & x$visit_type %in% c('baseline_diagnosis'))
x <- x[ix,]
y <- y[,ix]
y <- y[!rownames(y) %in% my_doublets,]

my_res_list<-list()
Compartment <- c('NkT','BEry','Ery','Myeloid','Plasma') #,'Full'
res <- list()
for(iii in Compartment){
  
  ix <- grep(paste('^',iii,sep=''), rownames(y))
  z <- y[ix,]
  
# to the transformation proposed by Smithson and Verkuilen (2006)
cell_proportions = DR_data(t(z))
x$site_batch <- paste(x$site,x$batch,sep='_')
x$davies_based_risk <- factor(x$davies_based_risk,levels=c('standard_risk','high_risk'))
dr_fit_common <- DirichReg( cell_proportions ~ davies_based_risk + site_batch, x, model = "common" )

u = summary(dr_fit_common)
pvals = u$coef.mat[grep('davies_based_riskhigh_risk', rownames(u$coef.mat), invert=FALSE), 4]
v = names(pvals)

prob.ratio = exp( summary(dr_fit_common)$coefficients[paste0(rownames(z),":davies_based_riskhigh_risk")] )

pvals = matrix(pvals, ncol=length(u$varnames))
rownames(pvals) = gsub('davies_based_riskhigh_risk', '', v[1:nrow(pvals)])
colnames(pvals) = u$varnames

dirichlet_ND_comp_res <- data.frame(log2fc = log2(exp( summary(dr_fit_common)$coefficients[paste0(rownames(z),":davies_based_riskhigh_risk")] )) , 
                               pval=colMeans(pvals),
                               Marker=rownames(z))
rownames(dirichlet_ND_comp_res) <- gsub(':davies_based_riskhigh_risk','',rownames(dirichlet_ND_comp_res))

my_res_list[[iii]] <- dirichlet_ND_comp_res
}
dirichlet_ND_comp_res <- do.call(rbind,my_res_list)

Test difference between Compartment Normalization vs All counts.

dirichlet_ND_res$Marker <- rownames(dirichlet_ND_res)
risk_res_tmp <- dirichlet_ND_res[which(dirichlet_ND_res$Marker %in% dirichlet_ND_comp_res$Marker),]
risk_res_tmp <- risk_res_tmp[match(dirichlet_ND_comp_res$Marker,risk_res_tmp$Marker),]
identical(risk_res_tmp$Marker,dirichlet_ND_comp_res$Marker)
## [1] TRUE
a <-risk_res_tmp ; b <- dirichlet_ND_comp_res
cor.test(a$log2fc,b$log2fc,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$log2fc and b$log2fc
## t = 16.866, df = 86, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8167328 0.9173593
## sample estimates:
##       cor 
## 0.8762801
cor.test(a$pval,b$pval,method='pearson')
## 
##  Pearson's product-moment correlation
## 
## data:  a$pval and b$pval
## t = 12.944, df = 86, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7272917 0.8736110
## sample estimates:
##       cor 
## 0.8129073

Similarity between logfc and pvalue falls into 82% and 81% agreement respective.

Store data.

saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_risk_res_ND_DAA_dream.RDS',risk_res_ND)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_prog_res_ND_DAA_dream.RDS',progression_res_ND)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_risk_res_DAA_dream.RDS',risk_res)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_prog_res_DAA_dream.RDS',progression_res)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_res_compartment_nd_DAA_dream.RDS',res_compartment_nd)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_dirichlet_res_DAA_dream.RDS',dirichlet_res)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_dirichlet_ND_res_DAA_dream.RDS',dirichlet_ND_res)
saveRDS(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_dirichlet_ND_comp_res_DAA_dream.RDS',dirichlet_ND_comp_res)

write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_risk_res_ND_DAA_dream.csv',risk_res_ND)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_prog_res_ND_DAA_dream.csv',progression_res_ND)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_risk_res_DAA_dream.csv',risk_res)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_prog_res_DAA_dream.csv',progression_res)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_res_compartment_nd_DAA_dream.csv',res_compartment_nd)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_prog_res_DAA_dream.csv',progression_res)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_dirichlet_res_DAA_dream.csv',dirichlet_res)
write.csv(file='/Users/gonzae34/Documents/projects_gnjatic/MMRF_projects/immune_atlas/RData/results_diag_dirichlet_ND_comp_res_DAA_dream.csv',dirichlet_ND_comp_res)